Libraries

library(readxl)
library(summarytools)
#devtools::install_github("boxuancui/DataExplorer")
library(DataExplorer)
library(kableExtra)
library(dplyr)
library(reshape2)
library(lubridate)
library(tidyverse)
#library(textshape)
library(magrittr)
library(ggplot2)
library(factoextra)
library(FactoMineR)
library(cluster)
library(clValid)
library(ape)
library(dendextend)
library(clustertend)
library(NbClust)
library(ggiraphExtra)
library(plotly)

Load Data

# NOTE(review): a hard-coded absolute setwd() makes the script non-portable;
# prefer project-relative paths (RStudio project or here::here()). Kept as-is.
setwd("/Users/danielesilva/Documents/Data Science Projects/Data_Science/Clustering Spotify")

#load data
#font: http://organizeyourmusic.playlistmachinery.com/
#Variables:
#Genre - the genre of the track
#Year - the release year of the recording. Note that due to vagaries of releases, re-releases, re-issues and general madness, sometimes the release years are not what you'd expect.
#Added - the earliest date you added the track to your collection.
#Beats Per Minute (BPM) - The tempo of the song.
#Energy - The energy of a song - the higher the value, the more energetic the song.
#Danceability - The higher the value, the easier it is to dance to this song.
#Loudness (dB) - The higher the value, the louder the song.
#Liveness - The higher the value, the more likely the song is a live recording.
#Valence - The higher the value, the more positive mood for the song.
#Length - The duration of the song.
#Acousticness - The higher the value the more acoustic the song is.
#Speechiness - The higher the value the more spoken word the song contains.
#Popularity - The higher the value the more popular the song is.
#Duration - The length of the song.

# Read the playlist export (one row per track)
data <- read_xlsx("my_top20.xlsx")

Exploratory Data

head(data)
## # A tibble: 6 x 13
##       N TITLE      ARTIST   RELEASE               BPM ENERGY DANCE  LOUD VALENCE
##   <dbl> <chr>      <chr>    <dttm>              <dbl>  <dbl> <dbl> <dbl>   <dbl>
## 1     1 Somebody … Gotye    2011-01-01 00:00:00   129     52    87    -7      75
## 2     2 Blinding … The Wee… 2020-03-20 00:00:00   171     73    51    -6      33
## 3     3 Café       Vitão    2019-03-08 00:00:00    92     50    78    -5      49
## 4     4 Graveto -… Marília… 2020-01-10 00:00:00    93     63    71    -5      40
## 5     5 Sei Lá     Projota  2019-04-25 00:00:00   152     43    66    -9      96
## 6     6 Don't Sta… Dua Lipa 2020-03-27 00:00:00   124     79    79    -5      68
## # … with 4 more variables: LENGTH <dttm>, ACOUSTIC <dbl>, POP. <dbl>, RND <dbl>
summary(data) %>% kable() %>% kable_styling()
N TITLE ARTIST RELEASE BPM ENERGY DANCE LOUD VALENCE LENGTH ACOUSTIC POP. RND
Min. : 1.00 Length:100 Length:100 Min. :1966-08-05 00:00:00 Min. : 80.0 Min. :17.00 Min. :33.00 Min. :-15.00 Min. :10.00 Min. :1899-12-31 02:00:00 Min. : 0.00 Min. :30.00 Min. : 45
1st Qu.: 25.75 Class :character Class :character 1st Qu.:2018-02-16 00:00:00 1st Qu.:102.8 1st Qu.:59.75 1st Qu.:59.50 1st Qu.: -6.25 1st Qu.:49.00 1st Qu.:1899-12-31 02:50:45 1st Qu.: 7.75 1st Qu.:65.75 1st Qu.:2526
Median : 50.50 Mode :character Mode :character Median :2019-08-16 00:00:00 Median :123.5 Median :72.00 Median :69.00 Median : -5.00 Median :64.00 Median :1899-12-31 03:09:30 Median :19.00 Median :72.50 Median :5306
Mean : 50.50 NA NA Mean :2015-09-24 07:12:00 Mean :126.0 Mean :69.58 Mean :67.69 Mean : -5.54 Mean :62.79 Mean :1899-12-31 03:14:28 Mean :28.22 Mean :72.65 Mean :4908
3rd Qu.: 75.25 NA NA 3rd Qu.:2019-12-13 00:00:00 3rd Qu.:146.2 3rd Qu.:82.25 3rd Qu.:78.00 3rd Qu.: -4.00 3rd Qu.:80.00 3rd Qu.:1899-12-31 03:35:00 3rd Qu.:46.25 3rd Qu.:83.00 3rd Qu.:6728
Max. :100.00 NA NA Max. :2020-06-30 00:00:00 Max. :196.0 Max. :95.00 Max. :96.00 Max. : 2.00 Max. :97.00 Max. :1899-12-31 05:04:00 Max. :94.00 Max. :97.00 Max. :9945
#glimpse(data) %>% kable() %>% kable_styling()

#st_options(plain.ascii = FALSE)

print(dfSummary(data, graph.magnif = 0.75), method = 'render')

Data Frame Summary

data

Dimensions: 100 x 13
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 N [numeric] Mean (sd) : 50.5 (29) min < med < max: 1 < 50.5 < 100 IQR (CV) : 49.5 (0.6) 100 distinct values 100 (100.0%) 0 (0.0%)
2 TITLE [character] 1. 3 Batidas - Ao Vivo 2. A Gente Fez Amor - Ao Viv 3. Adore You 4. Aí Eu Bebo - Ao Vivo 5. Amor de Que 6. Barzinho Aleatório - Ao V 7. Bebaça - Ao Vivo 8. Bebi Minha Bicicleta (Cor 9. Blinding Lights 10. BRABA [ 90 others ]
1(1.0%)
1(1.0%)
1(1.0%)
1(1.0%)
1(1.0%)
1(1.0%)
1(1.0%)
1(1.0%)
1(1.0%)
1(1.0%)
90(90.0%)
100 (100.0%) 0 (0.0%)
3 ARTIST [character] 1. Marília Mendonça 2. Vitão 3. Dua Lipa 4. Eminem 5. Zé Neto & Cristiano 6. Anitta 7. Bastille 8. Ed Sheeran 9. Harry Styles 10. Henrique & Juliano [ 61 others ]
5(5.0%)
4(4.0%)
3(3.0%)
3(3.0%)
3(3.0%)
2(2.0%)
2(2.0%)
2(2.0%)
2(2.0%)
2(2.0%)
72(72.0%)
100 (100.0%) 0 (0.0%)
4 RELEASE [POSIXct, POSIXt] min : 1966-08-05 med : 2019-08-16 max : 2020-06-30 range : 53y 10m 25d 83 distinct values 100 (100.0%) 0 (0.0%)
5 BPM [numeric] Mean (sd) : 126 (27.3) min < med < max: 80 < 123.5 < 196 IQR (CV) : 43.5 (0.2) 56 distinct values 100 (100.0%) 0 (0.0%)
6 ENERGY [numeric] Mean (sd) : 69.6 (17.3) min < med < max: 17 < 72 < 95 IQR (CV) : 22.5 (0.2) 54 distinct values 100 (100.0%) 0 (0.0%)
7 DANCE [numeric] Mean (sd) : 67.7 (13.9) min < med < max: 33 < 69 < 96 IQR (CV) : 18.5 (0.2) 47 distinct values 100 (100.0%) 0 (0.0%)
8 LOUD [numeric] Mean (sd) : -5.5 (2.3) min < med < max: -15 < -5 < 2 IQR (CV) : 2.2 (-0.4) 13 distinct values 100 (100.0%) 0 (0.0%)
9 VALENCE [numeric] Mean (sd) : 62.8 (20.9) min < med < max: 10 < 64 < 97 IQR (CV) : 31 (0.3) 56 distinct values 100 (100.0%) 0 (0.0%)
10 LENGTH [POSIXct, POSIXt] min : 1899-12-31 02:00:00 med : 1899-12-31 03:09:30 max : 1899-12-31 05:04:00 range : 3H 4M 0S 72 distinct values 100 (100.0%) 0 (0.0%)
11 ACOUSTIC [numeric] Mean (sd) : 28.2 (25.1) min < med < max: 0 < 19 < 94 IQR (CV) : 38.5 (0.9) 60 distinct values 100 (100.0%) 0 (0.0%)
12 POP. [numeric] Mean (sd) : 72.7 (11.4) min < med < max: 30 < 72.5 < 97 IQR (CV) : 17.2 (0.2) 37 distinct values 100 (100.0%) 0 (0.0%)
13 RND [numeric] Mean (sd) : 4907.8 (2795.6) min < med < max: 45 < 5306.5 < 9945 IQR (CV) : 4202 (0.6) 100 distinct values 100 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.9 (R version 4.0.4)
2021-04-04

#DataExplorer

plot_str(data)

plot_missing(data)

plot_histogram(data)

plot_correlation(data, type = 'continuous')

#plot_bar(data) 

#plot_boxplot(data) 

Data Preparation

# Coerce column names into syntactically valid R identifiers
# (e.g. names containing spaces would become dot-separated), then echo them
data <- setNames(data, make.names(names(data)))
names(data)
##  [1] "N"        "TITLE"    "ARTIST"   "RELEASE"  "BPM"      "ENERGY"  
##  [7] "DANCE"    "LOUD"     "VALENCE"  "LENGTH"   "ACOUSTIC" "POP."    
## [13] "RND"
# Drop duplicate rows, derive a DURATION Period from LENGTH, and keep only
# the numeric audio features that will feed the clustering.
data_numeric = data %>%
        unique() %>%
# Transform LENGTH: Excel exported it as a datetime on 1899-12-31 where the
# hour/minute fields appear to carry the song's minutes/seconds (see the
# summary output above — TODO confirm). "%H:%M" extracts those two fields
# and ms() re-parses them as a minutes:seconds Period.
        mutate(DURATION = ms(format(as.POSIXct(strptime(LENGTH, "%Y-%m-%d %H:%M:%S",tz="")), format= "%H:%M")))%>%
# Remove identifiers and non-feature columns; keep just numeric variables
        select(-N,-TITLE,-ARTIST,-RELEASE,-RND,-POP.,-LENGTH) 
# (disabled) create y for vtreat
#        mutate(y = rowSums(across(where(is.numeric))))

# Inspect the result — note DURATION is an S4 lubridate Period, not numeric
str(data_numeric)
## tibble [100 × 7] (S3: tbl_df/tbl/data.frame)
##  $ BPM     : num [1:100] 129 171 92 93 152 124 98 138 130 95 ...
##  $ ENERGY  : num [1:100] 52 73 50 63 43 79 59 28 48 82 ...
##  $ DANCE   : num [1:100] 87 51 78 71 66 79 82 58 73 55 ...
##  $ LOUD    : num [1:100] -7 -6 -5 -5 -9 -5 -6 -9 -7 -4 ...
##  $ VALENCE : num [1:100] 75 33 49 40 96 68 51 81 50 56 ...
##  $ ACOUSTIC: num [1:100] 55 0 5 29 52 1 69 94 57 12 ...
##  $ DURATION:Formal class 'Period' [package "lubridate"] with 6 slots
##   .. ..@ .Data : num [1:100] 5 20 10 51 10 3 29 7 59 54 ...
##   .. ..@ year  : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ month : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ day   : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ hour  : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ minute: num [1:100] 4 3 3 2 3 3 3 2 2 2 ...
# Standardize every feature (zero mean, unit variance) so distance-based
# clustering is not dominated by the largest-scaled variable
data_scale <- as.data.frame(scale(data_numeric))

# Boxplots of the standardized features to eyeball outliers
ggplot(stack(data_scale), aes(x = ind, y = values)) +
  geom_boxplot(aes(fill = ind))

# Outliers visible in ENERGY and LOUD

# Use song titles as row names so cluster plots and dendrograms are labelled
rownames(data_scale) <- data$TITLE

Choose the optimal number of cluster

# Fix the RNG seed so the stochastic steps below are reproducible
set.seed(1990)

# Are the data clusterable at all? Hopkins statistic on n - 1 sampled points.
# NOTE(review): clustertend::hopkins reports H where values near 0.5 suggest
# a uniform (non-clusterable) distribution — confirm the convention against
# the package docs before interpreting H = 0.39 as strong tendency.
hopkins(data_scale, n = nrow(data_scale)-1)
## $H
## [1] 0.3868532
# PCA to understand the variance structure before clustering
res.pca <- PCA(data_scale, graph = FALSE)

# Scree plot: percentage of variance explained per principal component
fviz_screeplot(res.pca, addlabels = TRUE, ylim = c(0, 50))

# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

# Variable correlation circle, coloured by contribution to the principal axes
# (removed an unused `var <- get_pca_var(res.pca)` assignment)
fviz_pca_var(res.pca, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # avoid text overlapping
             ) + theme_minimal() + ggtitle("Variables - PCA")

# 1 - Elbow method: look for the "knee" in total within-cluster SS
fviz_nbclust(data_scale, kmeans, method = "wss", k.max = 24) +
  theme_minimal() +
  ggtitle("the Elbow Method")

# 2 - Gap statistic, estimated against 50 bootstrap reference datasets
gap_stat <- clusGap(data_scale, FUN = kmeans, nstart = 30, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) +
  theme_minimal() +
  ggtitle("fviz_gap_stat: Gap Statistic")

# 3 - Average silhouette width per candidate number of clusters
fviz_nbclust(data_scale, kmeans, method = "silhouette", k.max = 24) +
  theme_minimal() +
  ggtitle("The Silhouette Plot")

# 4 - Sum of Squares

# k-means is sensitive to the initial centroid positions, so nstart = 30
# tries 30 random configurations and keeps the best one. The original code
# only passed nstart = 30 for k = 2 and 3; it is now applied consistently
# for every k, and the copy-pasted km2..km10 objects are replaced by a list.
km_fits <- lapply(2:10, function(k) kmeans(data_scale, k, nstart = 30))

ssc <- data.frame(
  kmeans = 2:10,
  # mean within-cluster SS per cluster
  # NOTE(review): tot.withinss is the more conventional summary; mean()
  # is kept to match the original comparison
  within_ss = vapply(km_fits, function(km) mean(km$withinss), numeric(1)),
  between_ss = vapply(km_fits, function(km) km$betweenss, numeric(1))
)

# Long format: one row per (k, measurement)
ssc %<>% gather(., key = "measurement", value = value, -kmeans)

# Compare within- vs between-cluster SS (log scale) across k
ssc %>% ggplot(., aes(x = kmeans, y = log10(value), fill = measurement)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Cluster Model Comparison") +
  xlab("Number of Clusters") +
  ylab("Log10 Total Sum of Squares") +
  scale_x_discrete(name = "Number of Clusters",
                   limits = c("0", "2", "3", "4", "5", "6", "7", "8", "9", "10"))

# 5 - NbClust: vote across ~30 validity indices for the best k
res.nbclust <- NbClust(
  data_scale,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 9,
  method = "complete",
  index = "all"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 7 proposed 2 as the best number of clusters 
## * 4 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 7 proposed 6 as the best number of clusters 
## * 2 proposed 9 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
# Visualise NbClust's per-index votes (the class()-comparison warnings that
# follow come from factoextra under R >= 4.0, not from this call's arguments)
factoextra::fviz_nbclust(res.nbclust) +
  theme_minimal() +
  ggtitle("NbClust's optimal number of clusters")
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") .viz_NbClust(x, print.summary, : the
## condition has length > 1 and only the first element will be used
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") {: the condition has length > 1 and
## only the first element will be used
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 7 proposed  2 as the best number of clusters
## * 4 proposed  3 as the best number of clusters
## * 1 proposed  4 as the best number of clusters
## * 2 proposed  5 as the best number of clusters
## * 7 proposed  6 as the best number of clusters
## * 2 proposed  9 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

Clustering

# Compare four clustering algorithms over k = 2..24 using internal validation
# measures (connectivity, Dunn index, silhouette)
intern <- clValid(data_scale, nClust = 2:24, 
              clMethods = c("hierarchical","kmeans","pam","clara"), validation = "internal")
# Summary of validation scores per method and k
# NOTE(review): summary() on a clValid object prints to the console; piping
# its return value into kable() likely does not render the intended table —
# verify the knitted output.
summary(intern) %>% kable() %>% kable_styling()
## 
## Clustering Methods:
##  hierarchical kmeans pam clara 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
## 
## Validation Measures:
##                                   2        3        4        5        6        7        8        9       10       11       12       13       14       15       16       17       18       19       20       21       22       23       24
##                                                                                                                                                                                                                                          
## hierarchical Connectivity    2.9290  14.9460  19.0956  22.2385  36.2270  38.7056  54.7960  69.6940  72.1202  73.4536  87.0024  91.6425  94.1131  98.5024 100.9813 102.8948 104.6071 113.9536 116.3798 118.4131 119.2004 121.4599 122.9460
##              Dunn            0.3282   0.2925   0.2962   0.2962   0.2687   0.2687   0.2315   0.2022   0.2022   0.2022   0.2481   0.2481   0.2611   0.2611   0.2646   0.2646   0.2646   0.3362   0.3362   0.3362   0.3362   0.3362   0.3362
##              Silhouette      0.3055   0.2607   0.2219   0.2054   0.1563   0.1354   0.1495   0.1360   0.1228   0.1177   0.1485   0.1523   0.1273   0.1233   0.1298   0.1327   0.1271   0.1551   0.1476   0.1444   0.1462   0.1448   0.1442
## kmeans       Connectivity   28.0599  56.5659  70.6885  78.1079  83.8242  81.9790  91.8079  98.8040 101.2929 103.1262 113.5730 114.8782 112.1492 113.7115 117.8000 122.4063 128.5587 133.7075 137.0837 141.1226 146.1909 149.8587 147.7857
##              Dunn            0.2239   0.1438   0.1480   0.1711   0.2130   0.2163   0.2175   0.2246   0.2246   0.2246   0.2162   0.2162   0.2725   0.2709   0.2709   0.2723   0.2378   0.2828   0.2828   0.2828   0.2838   0.2838   0.2850
##              Silhouette      0.2401   0.1449   0.1716   0.1598   0.1604   0.1718   0.1740   0.1857   0.1807   0.1740   0.1673   0.1661   0.1699   0.1746   0.1730   0.1690   0.1598   0.1631   0.1617   0.1580   0.1619   0.1576   0.1727
## pam          Connectivity   52.8937  67.2933 101.6187  93.7992 108.5433 105.4698 106.6952 110.8313 112.4298 114.7060 115.1587 119.2746 124.5937 126.9611 130.3425 131.8520 135.5048 140.7603 143.1270 147.2710 154.3119 158.4560 152.8210
##              Dunn            0.1290   0.1342   0.1562   0.1480   0.1279   0.1279   0.1533   0.1406   0.1502   0.1673   0.1673   0.1673   0.1673   0.1949   0.1949   0.2081   0.2081   0.2081   0.2081   0.2081   0.2111   0.2424   0.2111
##              Silhouette      0.1946   0.1346   0.0980   0.1273   0.1247   0.1116   0.1424   0.1232   0.1221   0.1206   0.1300   0.1278   0.1247   0.1261   0.1327   0.1368   0.1383   0.1409   0.1397   0.1391   0.1337   0.1330   0.1427
## clara        Connectivity   63.3198  66.9361  92.1540 109.1377  88.2468 100.7560 111.7020 112.4694 121.3897 124.5119 125.5841 119.7817 137.0821 127.9401 144.2286 134.6952 147.3790 144.5032 144.0798 151.0274 155.8020 151.0313 154.0948
##              Dunn            0.0877   0.1436   0.1201   0.1323   0.1466   0.1325   0.1343   0.1423   0.1543   0.1817   0.1963   0.2184   0.2268   0.1997   0.1947   0.1982   0.1947   0.2081   0.2111   0.2666   0.2080   0.2666   0.1920
##              Silhouette      0.1510   0.1360   0.0970   0.1064   0.1222   0.1211   0.1260   0.1326   0.1191   0.1215   0.0948   0.1177   0.1209   0.1232   0.1208   0.1187   0.1171   0.1225   0.1281   0.1306   0.1158   0.1139   0.1394
## 
## Optimal Scores:
## 
##              Score  Method       Clusters
## Connectivity 2.9290 hierarchical 2       
## Dunn         0.3362 hierarchical 19      
## Silhouette   0.3055 hierarchical 2
# Dendrogram: hierarchical clustering with Ward linkage on Euclidean distances
dist_euc <- dist(data_scale, method = "euclidean")
res.hc <- hclust(dist_euc, method = "ward.D2")

# Cut the tree into 6 groups (used again below to colour plots and to
# label the final cluster table)
grp <- cutree(res.hc, k = 6)

# Visualize
#plot(res.hc, cex = 0.5) # plot tree

# Qualitative palette: one colour per cluster
colors <- RColorBrewer::brewer.pal(6, "Set2")

# Rectangular dendrogram with leaf labels coloured by cluster
dend <- as.dendrogram(res.hc)
labels_cex(dend) <- 0.5
dend %>%
  set("labels_col", value = colors, k = 6) %>%
  plot(main = "Color labels \nper cluster", cex = 1)

# Circular (fan) layout of the same tree, tips coloured by cluster
plot(as.phylo(res.hc), type = "fan", tip.color = colors[grp],
     label.offset = 1, cex = 0.5)

#clara

# CLARA (Clustering LARge Applications) with k = 6 on 100 sampled subsets
# (the original comment incorrectly said "k-means with k=5")
clara_clust <- clara(data_scale, 6, samples = 100)
fviz_cluster(clara_clust, data = data_scale, ellipse.type = "norm", geom = "text", 
             labelsize = 8, palette = colors, show.clust.cent = T) + theme_minimal() +
        ggtitle("k = 6")

Results

# Attach the hierarchical cluster assignment and the DURATION Period
# (same LENGTH conversion as used for data_numeric above)
data_clusters = as.data.frame(data) %>% 
        mutate(Cluster = as.factor(grp)) %>% 
        mutate(DURATION = ms(format(as.POSIXct(strptime(LENGTH, "%Y-%m-%d %H:%M:%S",tz="")), format= "%H:%M")))
        
# Mean of each feature per cluster.
# NOTE(review): columns 5:11 include LENGTH (a POSIXct), whose mean comes out
# as a meaningless epoch number (see the -2209... values in the table below);
# consider dropping LENGTH in favour of DURATION here.
aggregate(data = data_clusters[,c(5:11,14:15)], . ~ Cluster, mean) %>% kable() %>% kable_styling()
Cluster BPM ENERGY DANCE LOUD VALENCE LENGTH ACOUSTIC DURATION
1 118.8571 49.00000 74.00000 -7.571429 76.85714 -2209063449 63.285714 15.857143
2 134.5625 76.75000 47.00000 -5.125000 39.25000 -2209063365 33.750000 28.500000
3 113.3077 72.69231 75.53846 -5.230769 65.61538 -2209063329 9.307692 8.615385
4 102.9545 69.09091 70.95455 -5.590909 55.50000 -2209063091 24.181818 40.909091
5 144.1944 76.36111 73.63889 -4.500000 79.38889 -2209064387 23.416667 35.222222
6 113.6667 28.83333 50.83333 -11.000000 30.16667 -2209061010 57.166667 16.500000
# Join the scaled features to their cluster memberships (keyed on the
# row name, i.e. the song title)
scaled_df <- as.data.frame(data_scale) %>% rownames_to_column()
membership <- as.data.frame(grp) %>% rownames_to_column()
colnames(membership) <- c("rowname", "cluster")

data_scale_cl <- inner_join(membership, scaled_df)

# One radar chart per cluster showing its mean attribute profile
ggRadar(data_scale_cl[-1], aes(group = cluster), rescale = FALSE,
        legend.position = "none", size = 1, interactive = FALSE,
        use.label = TRUE) +
  facet_wrap(~cluster) +
  scale_y_discrete(breaks = NULL) + # don't show ticks
  theme(axis.text.x = element_text(size = 10)) +
  scale_fill_manual(values = rep(colors, nrow(data_scale_cl))) +
  scale_color_manual(values = rep(colors, nrow(data_scale_cl))) +
  ggtitle("Music Attributes per Cluster")

# Reshape to long format for the faceted density plots:
# one row per (song, cluster, attribute)
long_attrs <- melt(
  data_clusters,
  id.vars = c("N", "Cluster"),
  measure.vars = c("BPM", "ENERGY", "DANCE", "LOUD", "VALENCE", "ACOUSTIC"),
  variable.name = "VAR",
  value.name = "Value"
)

# Density of each attribute, coloured by cluster, free scales per facet
density_panels <- ggplot(long_attrs, aes(x = Value, fill = Cluster)) +
  geom_density(alpha = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  facet_wrap(. ~ VAR, scales = "free")

# Interactive version via plotly
subplot(ggplotly(density_panels))

References

https://towardsdatascience.com/10-tips-for-choosing-the-optimal-number-of-clusters

http://www.sthda.com/english/wiki/beautiful-dendrogram-visualizations-in-r-5-must-known-methods-unsupervised-machine-learning

http://www.sthda.com/english/wiki/wiki.php?id_contents=7930